EDA

Import the dataset, explore and summarize it


In [11]:
# load the necessary python modules
import matplotlib.pyplot as plt
import matplotlib
import pickle
import pandas as pd 
import numpy as np
from IPython.display import display
%matplotlib notebook

In [12]:
### Load the dictionary containing the dataset. This code taken from poi_id.py script provided by udacity.
# BUG FIX: pickle files are binary -- open with "rb", not "r". Text mode fails
# outright on Python 3 (pickle.load needs bytes) and can corrupt the stream on
# Windows; "rb" also works unchanged on Python 2.
# NOTE(review): pickle.load can execute arbitrary code from an untrusted file;
# acceptable here since the dataset ships with the course project.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

In [13]:
# Headline numbers for the project report: people, features per person, POIs.
num_persons = len(data_dict)
num_features = len(list(data_dict.values())[0])
num_pois = sum(1 for person in data_dict.values() if person['poi'])
print("Total Number of persons: %d" % num_persons)
print("Total Number of features: %d" % num_features)
print("Total Number of POIs: %d" % num_pois)


Total Number of persons: 146
Total Number of features: 21
Total Number of POIs: 18

In [14]:
# BUG FIX: this was a Python 2-only print statement, inconsistent with the
# print() calls used everywhere else in this notebook. list() keeps the
# rendered output a plain list under Python 3 (where keys() is a dict view).
print(list(data_dict.keys()))


['METTS MARK', 'BAXTER JOHN C', 'ELLIOTT STEVEN', 'CORDES WILLIAM R', 'HANNON KEVIN P', 'MORDAUNT KRISTINA M', 'MEYER ROCKFORD G', 'MCMAHON JEFFREY', 'HORTON STANLEY C', 'PIPER GREGORY F', 'HUMPHREY GENE E', 'UMANOFF ADAM S', 'BLACHMAN JEREMY M', 'SUNDE MARTIN', 'GIBBS DANA R', 'LOWRY CHARLES P', 'COLWELL WESLEY', 'MULLER MARK S', 'JACKSON CHARLENE R', 'WESTFAHL RICHARD K', 'WALTERS GARETH W', 'WALLS JR ROBERT H', 'KITCHEN LOUISE', 'CHAN RONNIE', 'BELFER ROBERT', 'SHANKMAN JEFFREY A', 'WODRASKA JOHN', 'BERGSIEKER RICHARD P', 'URQUHART JOHN A', 'BIBI PHILIPPE A', 'RIEKER PAULA H', 'WHALEY DAVID A', 'BECK SALLY W', 'HAUG DAVID L', 'ECHOLS JOHN B', 'MENDELSOHN JOHN', 'HICKERSON GARY J', 'CLINE KENNETH W', 'LEWIS RICHARD', 'HAYES ROBERT E', 'MCCARTY DANNY J', 'KOPPER MICHAEL J', 'LEFF DANIEL P', 'LAVORATO JOHN J', 'BERBERIAN DAVID', 'DETMERING TIMOTHY J', 'WAKEHAM JOHN', 'POWERS WILLIAM', 'GOLD JOSEPH', 'BANNANTINE JAMES M', 'DUNCAN JOHN H', 'SHAPIRO RICHARD S', 'SHERRIFF JOHN R', 'SHELBY REX', 'LEMAISTRE CHARLES', 'DEFFNER JOSEPH M', 'KISHKILL JOSEPH G', 'WHALLEY LAWRENCE G', 'MCCONNELL MICHAEL S', 'PIRO JIM', 'DELAINEY DAVID W', 'SULLIVAN-SHAKLOVITZ COLLEEN', 'WROBEL BRUCE', 'LINDHOLM TOD A', 'MEYER JEROME J', 'LAY KENNETH L', 'BUTTS ROBERT H', 'OLSON CINDY K', 'MCDONALD REBECCA', 'CUMBERLAND MICHAEL S', 'GAHN ROBERT S', 'MCCLELLAN GEORGE', 'HERMANN ROBERT J', 'SCRIMSHAW MATTHEW', 'GATHMANN WILLIAM D', 'HAEDICKE MARK E', 'BOWEN JR RAYMOND M', 'GILLIS JOHN', 'FITZGERALD JAY L', 'MORAN MICHAEL P', 'REDMOND BRIAN L', 'BAZELIDES PHILIP J', 'BELDEN TIMOTHY N', 'DURAN WILLIAM D', 'THORN TERENCE H', 'FASTOW ANDREW S', 'FOY JOE', 'CALGER CHRISTOPHER F', 'RICE KENNETH D', 'KAMINSKI WINCENTY J', 'LOCKHART EUGENE E', 'COX DAVID', 'OVERDYKE JR JERE C', 'PEREIRA PAULO V. FERRAZ', 'STABLER FRANK', 'SKILLING JEFFREY K', 'BLAKE JR. 
NORMAN P', 'SHERRICK JEFFREY B', 'PRENTICE JAMES', 'GRAY RODNEY', 'PICKERING MARK R', 'THE TRAVEL AGENCY IN THE PARK', 'NOLES JAMES L', 'KEAN STEVEN J', 'TOTAL', 'FOWLER PEGGY', 'WASAFF GEORGE', 'WHITE JR THOMAS E', 'CHRISTODOULOU DIOMEDES', 'ALLEN PHILLIP K', 'SHARP VICTORIA T', 'JAEDICKE ROBERT', 'WINOKUR JR. HERBERT S', 'BROWN MICHAEL', 'BADUM JAMES P', 'HUGHES JAMES A', 'REYNOLDS LAWRENCE', 'DIMICHELE RICHARD G', 'BHATNAGAR SANJAY', 'CARTER REBECCA C', 'BUCHANAN HAROLD G', 'YEAP SOON', 'MURRAY JULIA H', 'GARLAND C KEVIN', 'DODSON KEITH', 'YEAGER F SCOTT', 'HIRKO JOSEPH', 'DIETRICH JANET R', 'DERRICK JR. JAMES V', 'FREVERT MARK A', 'PAI LOU L', 'BAY FRANKLIN R', 'HAYSLETT RODERICK J', 'FUGH JOHN L', 'FALLON JAMES B', 'KOENIG MARK E', 'SAVAGE FRANK', 'IZZO LAWRENCE L', 'TILNEY ELIZABETH A', 'MARTIN AMANDA K', 'BUY RICHARD B', 'GRAMM WENDY L', 'CAUSEY RICHARD A', 'TAYLOR MITCHELL S', 'DONAHUE JR JEFFREY M', 'GLISAN JR BEN F']

In [15]:
# Build a DataFrame from the dict-of-dicts; persons arrive as columns.
enron_df = pd.DataFrame.from_dict(data_dict)
# 'TOTAL' and 'THE TRAVEL AGENCY IN THE PARK' are not people -- drop them
# before transposing to one-row-per-person orientation.
enron_df = enron_df.drop(['TOTAL', 'THE TRAVEL AGENCY IN THE PARK'], axis=1)
enron_df = enron_df.transpose()

# Coerce every column to numeric; unparsable entries (e.g. 'NaN' strings) become NaN.
enron_df_num = enron_df.apply(pd.to_numeric, errors='coerce')
# email_address is the only non-numeric feature and is not needed for this analysis.
enron_df_num = enron_df_num.drop('email_address', axis=1)

enron_df_num.describe()


Out[15]:
bonus deferral_payments deferred_income director_fees exercised_stock_options expenses from_messages from_poi_to_this_person from_this_person_to_poi loan_advances long_term_incentive other restricted_stock restricted_stock_deferred salary shared_receipt_with_poi to_messages total_payments total_stock_value
count 8.100000e+01 3.800000e+01 4.800000e+01 16.000000 1.010000e+02 94.000000 86.000000 86.000000 86.000000 3.000000e+00 6.500000e+01 9.100000e+01 1.090000e+02 1.700000e+01 9.400000e+01 86.000000 86.000000 1.230000e+02 1.250000e+02
mean 1.201773e+06 8.416025e+05 -5.810498e+05 89822.875000 2.959559e+06 54192.010638 608.790698 64.895349 41.232558 2.797500e+07 7.464912e+05 4.664105e+05 1.147424e+06 6.218928e+05 2.840875e+05 1176.465116 2073.860465 2.641806e+06 3.352073e+06
std 1.441679e+06 1.289323e+06 9.420764e+05 41112.700735 5.499450e+06 46108.377454 1841.033949 86.979244 100.073111 4.638256e+07 8.629174e+05 1.397376e+06 2.249770e+06 3.845528e+06 1.771311e+05 1178.317641 2582.700981 9.524694e+06 6.532883e+06
min 7.000000e+04 -1.025000e+05 -3.504386e+06 3285.000000 3.285000e+03 148.000000 12.000000 0.000000 0.000000 4.000000e+05 6.922300e+04 2.000000e+00 -2.604490e+06 -1.787380e+06 4.770000e+02 2.000000 57.000000 1.480000e+02 -4.409300e+04
25% 4.250000e+05 7.964450e+04 -6.112092e+05 83674.500000 5.067650e+05 22479.000000 22.750000 10.000000 1.000000 1.200000e+06 2.750000e+05 1.203000e+03 2.520550e+05 -3.298250e+05 2.118020e+05 249.750000 541.250000 3.969340e+05 4.941360e+05
50% 7.500000e+05 2.210635e+05 -1.519270e+05 106164.500000 1.297049e+06 46547.500000 41.000000 35.000000 8.000000 2.000000e+06 4.221580e+05 5.158700e+04 4.410960e+05 -1.402640e+05 2.587410e+05 740.500000 1211.000000 1.101393e+06 1.095040e+06
75% 1.200000e+06 8.672112e+05 -3.792600e+04 112815.000000 2.542813e+06 78408.500000 145.500000 72.250000 24.750000 4.176250e+07 8.318090e+05 3.319830e+05 9.850320e+05 -7.241900e+04 3.086065e+05 1888.250000 2634.750000 2.087530e+06 2.606763e+06
max 8.000000e+06 6.426990e+06 -8.330000e+02 137864.000000 3.434838e+07 228763.000000 14368.000000 528.000000 609.000000 8.152500e+07 5.145434e+06 1.035973e+07 1.476169e+07 1.545629e+07 1.111258e+06 5521.000000 15149.000000 1.035598e+08 4.911008e+07

In [16]:
# Number of records left after removing the two non-person entries.
enron_df_num.shape[0]


Out[16]:
144

We are left with 144 records now in our dataframe.

Also, the summary of the dataset shows a very large standard deviation for some of the features and a lot of missing data for others. We will drop some of these features as below.


In [17]:
# These three features are almost entirely missing (see the describe() output:
# only 3, 17 and 16 non-null values respectively), so drop them in place.
enron_df_num.drop(['loan_advances', 'restricted_stock_deferred', 'director_fees'],
                  axis=1, inplace=True)

In [18]:
# Feature selection: rank features by their correlation with the POI label.
data_corr_list = enron_df_num.corr()
poi_correlations = data_corr_list['poi']
print('\nCorrelations between features to POI:\n ' + str(poi_correlations))


Correlations between features to POI:
 bonus                      0.302384
deferral_payments         -0.098428
deferred_income           -0.265698
exercised_stock_options    0.503551
expenses                   0.060292
from_messages             -0.074308
from_poi_to_this_person    0.167722
from_this_person_to_poi    0.112940
long_term_incentive        0.254723
other                      0.120270
poi                        1.000000
restricted_stock           0.224814
salary                     0.264976
shared_receipt_with_poi    0.228313
to_messages                0.058954
total_payments             0.230102
total_stock_value          0.366462
Name: poi, dtype: float64

Features ‘exercised_stock_options’, ‘total_stock_value’, and ‘bonus’ have the highest correlation to POI, in descending order.


In [19]:
# Separate the label from the numeric feature matrix.
del enron_df_num['poi']
poi = enron_df['poi']

# Engineered stock features. Note: plain '+' chains propagate NaN when any
# component is missing (unlike DataFrame.sum, which skips NaN).
enron_df_num['stock_sum'] = (enron_df_num['exercised_stock_options']
                             + enron_df_num['total_stock_value']
                             + enron_df_num['restricted_stock'])
enron_df_num['stock_ratio'] = (enron_df_num['exercised_stock_options']
                               / enron_df_num['total_stock_value'])

# Engineered money features.
enron_df_num['money_total'] = (enron_df_num['salary']
                               + enron_df_num['bonus']
                               - enron_df_num['expenses'])
enron_df_num['money_ratio'] = enron_df_num['bonus'] / enron_df_num['salary']

# Email-traffic features: overall send share and POI interaction rates.
total_messages = enron_df_num['to_messages'] + enron_df_num['from_messages']
enron_df_num['email_ratio'] = enron_df_num['from_messages'] / total_messages
enron_df_num['poi_email_ratio_from'] = (enron_df_num['from_poi_to_this_person']
                                        / enron_df_num['to_messages'])
enron_df_num['poi_email_ratio_to'] = (enron_df_num['from_this_person_to_poi']
                                      / enron_df_num['from_messages'])

# Replace missing values with each column's mean (computed on observed values).
enron_df_num = enron_df_num.fillna(enron_df_num.mean())

# Min-max scale every feature into the [0, 1] range.
col_min = enron_df_num.min()
col_max = enron_df_num.max()
enron_df_num = (enron_df_num - col_min) / (col_max - col_min)

In [20]:
from sklearn.feature_selection import SelectKBest

# Score every feature against the POI label using SelectKBest's default
# univariate scoring function, then rank features from strongest to weakest.
selector = SelectKBest()
selector.fit(enron_df_num, poi.tolist())
scores = dict(zip(enron_df_num.columns, selector.scores_))
sorted_features = sorted(scores, key=scores.get, reverse=True)
for feature in sorted_features:
    print('Feature %s has value %f' % (feature, scores[feature]))


Feature exercised_stock_options has value 29.133390
Feature total_stock_value has value 21.477343
Feature stock_sum has value 15.039523
Feature poi_email_ratio_to has value 13.360475
Feature bonus has value 11.437118
Feature money_total has value 10.334752
Feature salary has value 9.398674
Feature total_payments has value 7.734639
Feature restricted_stock has value 6.853888
Feature long_term_incentive has value 5.964237
Feature shared_receipt_with_poi has value 5.730789
Feature deferred_income has value 5.610048
Feature money_ratio has value 3.895578
Feature from_poi_to_this_person has value 3.036263
Feature email_ratio has value 2.035016
Feature other has value 1.908430
Feature from_this_person_to_poi has value 1.360849
Feature poi_email_ratio_from has value 1.161332
Feature from_messages has value 0.585913
Feature expenses has value 0.478571
Feature deferral_payments has value 0.380285
Feature to_messages has value 0.368235
Feature stock_ratio has value 0.013267

In [21]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.grid_search import RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.cross_validation import StratifiedShuffleSplit
import scipy
import warnings
warnings.filterwarnings('ignore')

# GaussianNB has no hyper-parameters to tune; wrapping it in an empty grid
# search just gives all three models the same fit()/best_estimator_ API.
gnb_clf = GridSearchCV(GaussianNB(), {})

svc_clf = SVC()
svc_search_params = {'C': scipy.stats.expon(scale=1),
                     'gamma': scipy.stats.expon(scale=.1),
                     'kernel': ['linear', 'poly', 'rbf'],
                     'class_weight': ['balanced', None]}
svc_search = RandomizedSearchCV(svc_clf,
                                param_distributions=svc_search_params,
                                n_iter=25)

tree_clf = DecisionTreeClassifier()
tree_search_params = {'criterion': ['gini', 'entropy'],
                      'max_leaf_nodes': [None, 25, 50, 100, 1000],
                      'min_samples_split': [2, 3, 4],
                      'max_features': [0.25, 0.5, 0.75, 1.0]}
tree_search = GridSearchCV(tree_clf,
                           tree_search_params,
                           scoring='recall')

search_methods = [gnb_clf, svc_search, tree_search]
# Index 0 of each list is a dummy entry so that index k lines up with the
# result for the top-k feature subset.
average_accuracies = [[0], [0], [0]]
average_precision = [[0], [0], [0]]
average_recall = [[0], [0], [0]]

num_splits = 10
train_split = 0.9
# Stratified splits preserve the POI/non-POI ratio in every fold -- important
# because POIs are a small minority of the 144 records.
indices = list(StratifiedShuffleSplit(poi.tolist(),
                                      num_splits,
                                      test_size=1 - train_split,
                                      random_state=0))

best_features = None
max_score = 0
best_classifier = None
for num_features in range(1, len(sorted_features) + 1):
    # Evaluate the top-k features as ranked earlier by SelectKBest.
    features = sorted_features[:num_features]
    feature_df = enron_df_num[features]
    for classifier_idx in range(len(search_methods)):
        sum_values = [0, 0, 0]
        # Only run the (expensive) hyper-parameter search once, on the
        # training half of the first split; reuse the best estimator after.
        search_methods[classifier_idx].fit(feature_df.iloc[indices[0][0], :],
                                           poi[indices[0][0]].tolist())
        classifier = search_methods[classifier_idx].best_estimator_
        for split_idx in range(num_splits):
            train_indices, test_indices = indices[split_idx]
            train_data = (feature_df.iloc[train_indices, :], poi[train_indices].tolist())
            test_data = (feature_df.iloc[test_indices, :], poi[test_indices].tolist())
            classifier.fit(train_data[0], train_data[1])
            predicted = classifier.predict(test_data[0])
            # BUG FIX: sklearn metrics take (y_true, y_pred) in that order.
            # The original passed predictions first, which silently swapped
            # precision and recall (accuracy is symmetric so it was unaffected).
            sum_values[0] += accuracy_score(test_data[1], predicted)
            sum_values[1] += precision_score(test_data[1], predicted)
            sum_values[2] += recall_score(test_data[1], predicted)
        avg_acc, avg_prs, avg_recall = [val / num_splits for val in sum_values]
        average_accuracies[classifier_idx].append(avg_acc)
        average_precision[classifier_idx].append(avg_prs)
        average_recall[classifier_idx].append(avg_recall)

        # Keep the best (precision+recall)/2 score, but only if both metrics
        # clear the project's 0.3 minimum requirement.
        score = (avg_prs + avg_recall) / 2
        if score > max_score and avg_prs > 0.3 and avg_recall > 0.3:
            max_score = score
            best_features = features
            best_classifier = search_methods[classifier_idx].best_estimator_
print('Best classifier found is %s \n\
       with score (recall+precision)/2 of %f\n\
       and feature set %s'%(str(best_classifier),max_score,best_features))


Best classifier found is DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=0.25, max_leaf_nodes=50, min_samples_leaf=1,
            min_samples_split=4, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best') 
       with score (recall+precision)/2 of 0.420000
       and feature set ['exercised_stock_options', 'total_stock_value', 'stock_sum', 'poi_email_ratio_to', 'bonus', 'money_total']

In [ ]: